library(forecast)
library(fpp2)
## Loading required package: fma
##
## Attaching package: 'fma'
## The following object is masked from 'package:plyr':
##
## ozone
## Loading required package: expsmooth
library(broom)
library(tidyverse)
library(sweep)
library(tseries)
detach("package:dplyr", unload=TRUE)
## Warning: 'dplyr' namespace cannot be unloaded:
## namespace 'dplyr' is imported by 'timetk', 'tigris', 'broom', 'tidycensus', 'sweep' so cannot be unloaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
## The following object is masked from 'package:fma':
##
## pigs
library(e1071)
library(fpp2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Use a minimal ggplot2 theme for every plot in this document.
theme_set(theme_minimal())
\[y_t = \beta_0 + \beta_1x_t + \epsilon_t\]
![image](https://otexts.com/fpp2/fpp_files/figure-html/SLRpop1-1.png)
# Time plot of quarterly percentage changes in US consumption and income.
# The original labels were swapped: time is on the x-axis, % change on y.
autoplot(uschange[,c("Consumption","Income")]) +
  labs(x = "Year", y = "% change")
# Scatterplot of consumption vs income changes with the fitted OLS line.
# Fixes the "chnage" typo in the y-axis label and uses TRUE/FALSE over T/F.
uschange %>%
  as.data.frame() %>%
  ggplot(aes(Income, Consumption)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Income (quarterly % change)",
       y = "Consumption (quarterly % change)")
# Simple linear regression of consumption on income. tslm() is lm()
# adapted for time series (it understands trend/season terms).
tslm(Consumption ~ Income, data = uschange)
##
## Call:
## tslm(formula = Consumption ~ Income, data = uschange)
##
## Coefficients:
## (Intercept) Income
## 0.5451 0.2806
The interpretation of the intercept requires that a value of x = 0 makes sense.
In this case, when x = 0 — meaning income has not changed since last quarter — the predicted value of y is 0.55. Even when x = 0 makes no sense, the intercept is an important part of the model.
# Faceted time plots of all five uschange series (TRUE over reassignable T).
autoplot(uschange, facets = TRUE)
# Scatterplot matrix of all five uschange series.
ggpairs(as.data.frame(uschange))
The tslm() function fits a linear regression model to time series data; compared to lm(), tslm() provides additional facilities for handling time series.
# Multiple regression: consumption explained by four economic predictors.
fit_cos <- tslm(
  Consumption ~ Income + Production + Unemployment + Savings,
  data = uschange
)
summary(fit_cos)
##
## Call:
## tslm(formula = Consumption ~ Income + Production + Unemployment +
## Savings, data = uschange)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.88296 -0.17638 -0.03679 0.15251 1.20553
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.26729 0.03721 7.184 1.68e-11 ***
## Income 0.71449 0.04219 16.934 < 2e-16 ***
## Production 0.04589 0.02588 1.773 0.0778 .
## Unemployment -0.20477 0.10550 -1.941 0.0538 .
## Savings -0.04527 0.00278 -16.287 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3286 on 182 degrees of freedom
## Multiple R-squared: 0.754, Adjusted R-squared: 0.7486
## F-statistic: 139.5 on 4 and 182 DF, p-value: < 2.2e-16
# Overlay the fitted values on the observed consumption series.
autoplot(uschange[, "Consumption"], series = "Data") +
  autolayer(fitted(fit_cos), series = "Fitted") +
  labs(title = "Percent change in US consumption expenditure",
       x = "Year", y = "")
# Actual vs fitted scatter with a y = x reference line.
# aes(Data, Fitted) puts the actual data on x and fitted values on y,
# so the original axis labels were swapped; corrected here.
cbind(Data = uschange[, 'Consumption'],
      Fitted = fitted(fit_cos)) %>%
  as.data.frame() %>%
  ggplot(aes(Data, Fitted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(x = "Actual value",
       y = "Fitted value")
\[\hat{\sigma_e} = \sqrt{\frac{1}{T-k-1}\sum_{t=1}^T{e_t}^2}\]
# Residual diagnostics: time plot, ACF, histogram, plus the
# Breusch-Godfrey test for serial correlation.
checkresiduals(fit_cos)
##
## Breusch-Godfrey test for serial correlation of order up to 8
##
## data: Residuals from Linear regression model
## LM test = 14.874, df = 8, p-value = 0.06163
# KPSS test on the residuals; the null hypothesis is level stationarity.
kpss.test(residuals(fit_cos))
##
## KPSS Test for Level Stationarity
##
## data: residuals(fit_cos)
## KPSS Level = 0.51799, Truncation lag parameter = 4, p-value =
## 0.03762
The time plot shows some changing variation over time, but is otherwise relatively unremarkable. This heteroscedasticity will potentially make the prediction interval coverage inaccurate.
The histogram shows that the residuals seem to be slightly skewed, which may also affect the coverage probability of the prediction intervals.
The autocorrelation plot shows a spike significant at the 5% level, but it is still not quite enough for the Breusch-Godfrey test to be significant at the 5% level.
# Plot residuals against each predictor to look for remaining structure.
df <- as.data.frame(uschange)
df[["Residuals"]] <- as.numeric(residuals(fit_cos))
df %>%
  select(-Consumption) %>%
  gather(Income:Unemployment, key = "key", value = "value") %>%
  ggplot(aes(x = value, y = Residuals)) +
  geom_point() +
  facet_wrap(~key, scales = "free")
# Residuals against fitted values: a funnel shape would indicate
# heteroscedasticity.
augment(fit_cos) %>%
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point() +
  labs(x = "Fitted", y = "Residual")
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
Outliers: observations that take extreme values compared to the majority of the data are called outliers.
Influential observations: observations that have a large influence on the estimated coefficients of a regression model.
"Non-stationary": the values of the time series do not fluctuate around a constant mean or with a constant variance.
# Spurious-regression demo: Australian air passengers (to 2011) regressed
# on Guinea rice production — two unrelated trending series yield a very
# high R^2 despite having no causal connection.
aussies <- window(ausair, end=2011)
fit <- tslm(aussies ~ guinearice)
summary(fit)
##
## Call:
## tslm(formula = aussies ~ guinearice)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9448 -1.8917 -0.3272 1.8620 10.4210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.493 1.203 -6.229 2.25e-07 ***
## guinearice 40.288 1.337 30.135 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.239 on 40 degrees of freedom
## Multiple R-squared: 0.9578, Adjusted R-squared: 0.9568
## F-statistic: 908.1 on 1 and 40 DF, p-value: < 2.2e-16
# Residual diagnostics for the spurious regression: the strong serial
# correlation (significant Breusch-Godfrey test below) gives it away.
checkresiduals(fit)
##
## Breusch-Godfrey test for serial correlation of order up to 8
##
## data: Residuals from Linear regression model
## LM test = 28.813, df = 8, p-value = 0.000342
# Quarterly Australian beer production from 1992 onwards.
beer2 <- window(ausbeer, start = 1992)
autoplot(beer2) +
labs(x = "Year", y = "Megalitres")
# Seasonal naive forecast of beer production.
# NOTE(review): snaive() does not document a 'drift' argument (rwf() does);
# confirm whether drift = T has any effect here or is silently ignored.
autoplot(snaive(beer2, drift = T)) +
labs(x = "Year", y = "Megalitres")
We can see that this time series plot exhibits trend and seasonality. We can model this data using a regression model with a linear trend and quarterly dummy variables:
\(y_t = \beta_0 + \beta_1t +\beta_2d_{2,t} + \beta_3d_{3,t} + \beta_4d_{4,t} + \epsilon_t\)
where \(d_{i,t} = 1\) if t is in quarter i and 0 otherwise.
# Regression with a linear trend and quarterly seasonal dummy variables
# (Q1 is the baseline; season2..season4 are offsets from it).
fit_beer <- tslm(beer2 ~ trend + season)
summary(fit_beer)
##
## Call:
## tslm(formula = beer2 ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.903 -7.599 -0.459 7.991 21.789
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 441.80044 3.73353 118.333 < 2e-16 ***
## trend -0.34027 0.06657 -5.111 2.73e-06 ***
## season2 -34.65973 3.96832 -8.734 9.10e-13 ***
## season3 -17.82164 4.02249 -4.430 3.45e-05 ***
## season4 72.79641 4.02305 18.095 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.23 on 69 degrees of freedom
## Multiple R-squared: 0.9243, Adjusted R-squared: 0.9199
## F-statistic: 210.7 on 4 and 69 DF, p-value: < 2.2e-16
# Observed vs fitted beer production over time.
autoplot(beer2, series = "Data") +
  autolayer(fitted(fit_beer), series = "Fitted") +
  labs(title = "Quarterly Beer Production",
       x = "Year", y = "Megalitres")
# Actual vs fitted by quarter with a y = x reference line.
augment(fit_beer) %>%
  ggplot(aes(beer2, .fitted, color = season)) +
  geom_point() +
  labs(x = "Actual Value",
       y = "Fitted",
       title = "Quarterly beer production") +
  # The perfect-fit reference line passes through the origin:
  # intercept 0, not 1 (matching the identical plot earlier in the file).
  geom_abline(slope = 1, intercept = 0) +
  scale_color_brewer(palette = "Dark2", name = "Quarter")
# Per-observation fit statistics: fitted values, residuals, leverage,
# Cook's distance, and standardized residuals.
augment(fit_beer)
## # A tibble: 74 x 9
## beer2 trend season .fitted .resid .hat .sigma .cooksd .std.resid
## <dbl> <int> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 443 1 1 441. 1.54 0.0910 12.3 0.000349 0.132
## 2 410 2 2 406. 3.54 0.0910 12.3 0.00185 0.304
## 3 420 3 3 423. -2.96 0.0898 12.3 0.00127 -0.254
## 4 532 4 4 513. 18.8 0.0898 12.1 0.0510 1.61
## 5 433 5 1 440. -7.10 0.0830 12.3 0.00665 -0.606
## 6 421 6 2 405. 15.9 0.0830 12.2 0.0334 1.36
## 7 410 7 3 422. -11.6 0.0822 12.2 0.0176 -0.990
## 8 512 8 4 512. 0.125 0.0822 12.3 0.00000205 0.0107
## 9 449 9 1 439. 10.3 0.0759 12.3 0.0125 0.873
## 10 381 10 2 404. -22.7 0.0759 12.0 0.0614 -1.93
## # … with 64 more rows
Not recommended:
Plotting each individual x against y to see if there is a noticeable relationship and deciding whether or not to drop that x — it is not always possible to see the relationship from a scatterplot.
Running a multiple linear regression on all the x variables and discarding all variables whose p-values are greater than .05 — to start with, statistical significance does not always indicate predictive value.
Instead, we will use a measure of predictive accuracy, available via the CV() function.
# Leave-one-out CV statistic plus AIC/AICc/BIC/adjusted-R^2 in one call.
CV(fit_cos)
## CV AIC AICc BIC AdjR2
## 0.1163477 -409.2980298 -408.8313631 -389.9113781 0.7485856
\(SSE = \sum_{t=1}^{T}{e_t}^2\)
Adjusted R-squared:
\(\bar{R}^2 = 1 - (1- R^2)\frac{T-1}{T-k-1}\)
It is recommended to use AICc, AIC, or CV.
Ex-ante forecasts are those that are made using only the information that is available in advance. These are genuine forecasts, made in advance using whatever information is available at the time.
Ex-post forecasts use later information on the predictors. An ex-post forecast of consumption may use the actual observations of the predictors. These are not genuine forecasts, but they are useful for studying the behaviour of forecasting models.
# Forecast from the trend + season regression (default horizon spans
# two seasonal periods). Fixes the "usign" typo in the plot title.
fit_beer <- tslm(beer2 ~ trend + season)
forecast(fit_beer) %>%
  autoplot() +
  labs(x = "Year", y = "Megalitres",
       title = "Forecast of beer production using regression")
# Scenario-based forecasting: four quarters ahead under an "increase"
# scenario and a "decrease" scenario for the predictors.
fit.consBest <- tslm(
  Consumption ~ Income + Savings + Unemployment,
  data = uschange)
h <- 4
# Increase scenario: income +1%, savings +0.5%, unemployment flat.
newdata <- data.frame(
  Income = rep(1, h),
  Savings = rep(0.5, h),
  Unemployment = rep(0, h))
fcast.up <- forecast(fit.consBest, newdata = newdata)
# Decrease scenario: the mirror image.
newdata <- data.frame(
  Income = rep(-1, h),
  Savings = rep(-0.5, h),
  Unemployment = rep(0, h))
fcast.down <- forecast(fit.consBest, newdata = newdata)
autoplot(uschange[, 1]) +
  ylab("% change in US consumption") +
  autolayer(fcast.up, PI = TRUE, series = "increase") +
  autolayer(fcast.down, PI = TRUE, series = "decrease") +
  guides(colour = guide_legend(title = "Scenario"))
# Forecast consumption under average vs high income-growth scenarios.
fit_cons <- tslm(Consumption ~ Income, data = uschange)
fcast_avg <- forecast(fit_cons,
                      newdata = data.frame(Income = rep(mean(uschange[, "Income"]), 4)))
fcast_up <- forecast(fit_cons, newdata = data.frame(Income = rep(5, 4)))
# Plot the forecast variable (Consumption), not the predictor: the
# original plotted Income, so the forecast layers did not line up with
# the series being forecast. Also, a legend title is set via guides(),
# not labs(colour = guide_legend(...)).
autoplot(uschange[, "Consumption"]) +
  autolayer(fcast_avg, series = "Average") +
  autolayer(fcast_up, series = "Upper") +
  guides(colour = guide_legend(title = "Scenario"))
# Forecast horizon in years.
h <- 10
# Linear trend model for marathon winning times.
fit_lin <- tslm(marathon ~ trend)
fcast_line <- forecast(fit_lin, h = h)
# Exponential trend: lambda = 0 applies a log transformation inside tslm().
fit_exp <- tslm(marathon ~ trend, lambda = 0)
fcast_exp <- forecast(fit_exp, h = h)
# Piecewise linear trend with knots at 1940 and 1980.
t <- time(marathon)
t.break1 <- 1940
t.break2 <- 1980
# Hinge regressors: zero before each knot, linear (t - knot) afterwards.
tb1 <- ts(pmax(0, t - t.break1), start = 1897)
tb2 <- ts(pmax(0, t - t.break2), start = 1897)
fit.pw <- tslm(marathon ~ t + tb1 + tb2)
# Extend all three regressors h steps past the sample for forecasting.
t.new <- t[length(t)] + seq(h)
tb1.new <- tb1[length(tb1)] + seq(h)
tb2.new <- tb2[length(tb2)] + seq(h)
newdata <- cbind(t=t.new, tb1=tb1.new, tb2=tb2.new) %>%
as.data.frame()
fcasts.pw <- forecast(fit.pw, newdata = newdata)
# Cubic regression spline: cubic trend plus truncated cubic knot terms.
fit.spline <- tslm(marathon ~ t + I(t^2) + I(t^3) +
I(tb1^3) + I(tb2^3))
fcasts.spl <- forecast(fit.spline, newdata = newdata)
# Compare fitted values and forecasts from all four trend models.
# The original axis labels were swapped: time (Year) is on x,
# winning times (minutes) on y.
autoplot(marathon) +
  autolayer(fitted(fit_lin), series = "Linear") +
  autolayer(fitted(fit_exp), series = "Exponential") +
  autolayer(fitted(fit.pw), series = "Piecewise") +
  autolayer(fitted(fit.spline), series = "Cubic Spline") +
  autolayer(fcasts.pw, series = "Piecewise") +
  autolayer(fcast_line, series = "Linear", PI = FALSE) +
  autolayer(fcast_exp, series = "Exponential", PI = FALSE) +
  autolayer(fcasts.spl, series = "Cubic Spline", PI = FALSE) +
  labs(x = "Year", y = "Winning times in minutes")
# Cubic smoothing-spline forecast on the log scale (lambda = 0):
# fit once, then plot it and run the residual diagnostics.
spline_fc <- splinef(marathon, lambda = 0)
autoplot(spline_fc)
checkresiduals(spline_fc)
## Warning in modeldf.default(object): Could not find appropriate degrees of
## freedom for this model.
It’s important to understand that correlations are useful for forecasting, even when there is no causal relationship between the two variables.
We say that two variables are confounded when their effects on the forecast variable cannot be separated.
Confounding is not really a problem for forecasting, as we can still compute forecasts without needing to separate out the effects of the predictors. It does become a problem with scenario forecasting, as the scenarios should take account of the relationships between predictors.
When multicollinearity is present, the uncertainty associated with individual regression coefficients will be large.
Forecasts will be unreliable if the values of the future predictors are outside the range of the historical values of the predictors.
# First 20 days of daily electricity demand and temperature.
daily20 <- head(elecdaily, 20)
autoplot(daily20[, c("Demand", "Temperature")])
# Fit a linear model: demand explained by temperature.
eclec_fit <- tslm(Demand ~ Temperature, data = daily20)
summary(eclec_fit)
##
## Call:
## tslm(formula = Demand ~ Temperature, data = daily20)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.060 -7.117 -1.437 17.484 27.102
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.2117 17.9915 2.179 0.0428 *
## Temperature 6.7572 0.6114 11.052 1.88e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22 on 18 degrees of freedom
## Multiple R-squared: 0.8716, Adjusted R-squared: 0.8644
## F-statistic: 122.1 on 1 and 18 DF, p-value: 1.876e-09
There is a positive relationship between temperature and electricity demand, because people use more electricity during hot weather.
# Residual diagnostics for the 20-day electricity demand regression.
checkresiduals(eclec_fit)
##
## Breusch-Godfrey test for serial correlation of order up to 5
##
## data: Residuals from Linear regression model
## LM test = 3.8079, df = 5, p-value = 0.5774
The time plot shows some changing variation over time, which will potentially make the prediction intervals inaccurate. The residuals also seem to be skewed, which may affect the coverage probability of the prediction intervals.
# Demand vs temperature with the fitted regression line.
ggplot(as.data.frame(daily20), aes(x = Temperature, y = Demand)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# Distribution of the observed temperatures in the 20-day sample.
ggplot(as.data.frame(daily20), aes(x = Temperature)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
I probably will not believe the prediction, because this range exceeds the boundaries of my training dataset.
# Forecast demand over a 15-35 degree grid, then look at the full
# elecdaily data: the 20-day sample covers only part of that range.
elec_predict <- forecast(
  eclec_fit,
  newdata = data.frame(Temperature = seq(15, 35, by = 1))
)
ggplot(as.data.frame(elecdaily), aes(x = Temperature, y = Demand)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
The model only has data available from about 20 degrees to 40 degrees, and data outside this range may follow a different pattern than data inside the range.
Winning times in the Olympic men's 400m track final.
# Build a data frame of winning times with the Olympic year attached:
# observations are every 4 years starting in 1896 (row 1 -> 1896).
# tbl_df() is deprecated in tibble; as_tibble() is the replacement.
mens400_df <- mens400 %>%
  as_tibble() %>%
  mutate(year = row_number() * 4 + 1892)
mens400_df %>%
  ggplot(aes(year, x)) +
  geom_line()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
The time to finish the 400 m track final drops drastically from 1896 to 1920; the series has a downward slope.
Fit a linear model to see at what rate the winning times decrease.
# Regress finishing time (column x) on calendar year.
times_fit <- tslm(x ~ year, mens400_df)
times_fit
##
## Call:
## tslm(formula = x ~ year, data = mens400_df)
##
## Coefficients:
## (Intercept) year
## 172.48148 -0.06457
The finishing time decreases at around 0.065 seconds per year.
# Residuals over time: look for non-random patterns around the trend.
augment(times_fit) %>%
  ggplot(aes(x = year, y = .resid)) +
  geom_line()
## Warning in augment_columns(x, data, newdata, type.predict = type.predict, :
## When fitting with na.exclude, rows with NA in original data will be dropped
## unless those rows are provided in 'data' argument
This indicates that the prediction intervals from this linear fit may not be reliable.
# Rebuild as a ts with one observation every four years, so
# frequency = 1/4 (a four-year Olympic cycle per observation).
mens400_ts <- ts(mens400_df[, c("x")], start = 1896, frequency = 1/4)
times_fit <- tslm(x ~ trend, data = mens400_ts)
# Forecast the next two Olympic finals (2020, 2024).
autoplot(mens400_ts)+
autolayer(forecast(times_fit, h = 2)) +
labs(x = "Year", y = "Seconds")
forecast(times_fit, h = 2)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2020 42.04231 40.44975 43.63487 39.55286 44.53176
## 2024 41.78402 40.18198 43.38605 39.27976 44.28827
The assumption made in this calculation is that this linear trend will continue into the foreseeable future.
# easter() gives the fraction of the Easter holiday falling in each quarter.
head(easter(ausbeer), 20)
## Qtr1 Qtr2 Qtr3 Qtr4
## 1956 0.67 0.33 0.00 0.00
## 1957 0.00 1.00 0.00 0.00
## 1958 0.00 1.00 0.00 0.00
## 1959 1.00 0.00 0.00 0.00
## 1960 0.00 1.00 0.00 0.00
Sometimes the Easter holiday falls entirely within one quarter; sometimes it spans more than one quarter.
autoplot(fancy)
# Adjust to average daily sales, since months have different lengths.
fancy_df <- cbind(Monthly = fancy,
                  Dailyaverage = fancy / monthdays(fancy))
# The autoplot.mts argument is 'facets' (it sits after ..., so the
# misspelled 'facet = T' was silently ignored and no faceting happened).
autoplot(fancy_df, facets = TRUE) +
  labs(x = "year", y = "Monthly sales") +
  scale_y_continuous(labels = dollar_format())
During 1992-1994, we see much larger growth than the growth rate of the previous years.
If we fit a linear model it will be unreasonable, because the data show variation that increases with the level of the series. So a transformation is reasonable.
Box-Cox back-transformation: \(y_t = (\lambda w_t + 1)^{1/\lambda}\), where \(y_t\) is the original data and \(w_t\) is the transformed data.
# Choose a Box-Cox lambda automatically, transform the series, then fit
# a linear trend plus monthly seasonal dummies on the transformed scale.
lambda <- BoxCox.lambda(fancy)
fancy_trans <- BoxCox(fancy, lambda = lambda)
autoplot(fancy_trans)
fancy_lm <- tslm(fancy_trans ~ trend + season)
# Fitted vs residuals for the transformed-sales model.
augment(fancy_lm) %>%
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
# Residual diagnostics including the Breusch-Godfrey serial-correlation test.
checkresiduals(fancy_lm)
##
## Breusch-Godfrey test for serial correlation of order up to 16
##
## data: Residuals from Linear regression model
## LM test = 37.186, df = 16, p-value = 0.001974
The plots do not reveal any major problems with the model, although the Breusch-Godfrey test above is significant (p ≈ 0.002), indicating some remaining serial correlation in the residuals.
# Residuals grouped by calendar month (84 monthly observations = 7 full
# years starting in January, so rep(1:12, 7) labels them correctly).
fit_resid <- data.frame(resids = residuals(fancy_lm),
                        month = rep(1:12, 7))
ggplot(fit_resid, aes(x = factor(month), y = resids)) +
  geom_boxplot()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.
No, it does not reveal any problem.
# Tidy coefficient table for the transformed-sales regression.
tidy(fancy_lm)
## # A tibble: 13 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 7.67 0.0784 97.7 1.93e-77
## 2 trend 0.0228 0.000862 26.5 1.52e-38
## 3 season2 0.255 0.101 2.52 1.39e- 2
## 4 season3 0.708 0.101 6.99 1.23e- 9
## 5 season4 0.390 0.101 3.85 2.59e- 4
## 6 season5 0.415 0.101 4.10 1.10e- 4
## 7 season6 0.455 0.101 4.49 2.72e- 5
## 8 season7 0.620 0.101 6.11 4.87e- 8
## 9 season8 0.596 0.102 5.88 1.25e- 7
## 10 season9 0.679 0.102 6.68 4.42e- 9
## 11 season10 0.758 0.102 7.46 1.67e-10
## 12 season11 1.23 0.102 12.1 7.44e-19
## 13 season12 2.00 0.102 19.6 2.08e-30
# NOTE(review): duplicate of the earlier checkresiduals(fancy_lm) call.
checkresiduals(fancy_lm)
##
## Breusch-Godfrey test for serial correlation of order up to 16
##
## data: Residuals from Linear regression model
## LM test = 37.186, df = 16, p-value = 0.001974
# Subseries plot of the residuals by month: checks for leftover seasonality.
ggsubseriesplot(fancy_lm$residuals)
The autocorrelation plot shows significant spikes at lags 9 and 24, and the Breusch-Godfrey test is significant at the 5% level (p ≈ 0.002). Still, the autocorrelations are not particularly large, so they are unlikely to have a large impact on the forecasts, though the prediction intervals may be somewhat affected.
# Three years of monthly forecasts (h = 36) on the transformed scale.
fancy_predict <- forecast(fancy_lm, h = 36)
autoplot(fancy_predict)
# Weekly US gasoline supplied, truncated at the start of 2005.
# NOTE(review): the name says 2014 but the window actually ends at 2005.
gasoline_2014 <- window(gasoline, end = 2005)
autoplot(gasoline_2014)
# Fit harmonic (Fourier) regressions with K = 1, 2, 3, 5, 10, 20 pairs.
# assign()/get() create one fit and one plot per K in the global
# environment (e.g. tslm_fit_1_gasoline_2004, plot_1); those exact names
# are relied on below, so they must not change.
for(num in c(1, 2, 3, 5, 10, 20)) {
var_name <- paste("tslm_fit_",
num,
"_gasoline_2004",
sep = "")
plot_name <- paste("plot",
num,
sep = "_")
assign(var_name,
tslm(gasoline_2014 ~ trend + fourier(
gasoline_2014,
K = num
)))
assign(plot_name, autoplot(gasoline_2014) +
autolayer(get(var_name)$fitted.values,
series = as.character(num)) +
labs(title = var_name,
y = "gasoline",
subtitle = paste("fourier #", num)) +
theme(legend.position = "none"))
}
# Show the six fitted-value plots in a two-column grid.
grid.arrange(plot_1, plot_2, plot_3, plot_5,plot_10, plot_20, ncol = 2)
# Collect CV statistics for each harmonic-regression fit and compare.
p <- list(tslm_fit_1_gasoline_2004, tslm_fit_2_gasoline_2004, tslm_fit_3_gasoline_2004,
          tslm_fit_5_gasoline_2004, tslm_fit_10_gasoline_2004, tslm_fit_20_gasoline_2004)
cv_result <- lapply(p, CV) %>%
  as.data.frame()
# paste0() has no 'sep' argument; passing sep = "" just pasted an extra
# empty string, so drop it.
names(cv_result) <- paste0("CV", c(1, 2, 3, 5, 10, 20))
cv_result <- add_column(cv_result, names = rownames(cv_result))
cv_result %>%
  filter(names %in% c("AIC", "CV")) %>%
  gather(CV1:CV20, key = "key", value = "value")
## names key value
## 1 CV CV1 8.201921e-02
## 2 AIC CV1 -1.813292e+03
## 3 CV CV2 8.136854e-02
## 4 AIC CV2 -1.819113e+03
## 5 CV CV3 7.658846e-02
## 6 AIC CV3 -1.863085e+03
## 7 CV CV5 7.553646e-02
## 8 AIC CV5 -1.873234e+03
## 9 CV CV10 7.135754e-02
## 10 AIC CV10 -1.915014e+03
## 11 CV CV20 7.223834e-02
## 12 AIC CV20 -1.908017e+03
# Refit with K = 7 Fourier pairs, a compromise between the CV optimum
# and model simplicity, then check the residuals.
tslm_ft7_gasoline_2004 <- tslm(
gasoline_2014 ~ trend + fourier(
gasoline_2014,
K = 7))
checkresiduals(tslm_ft7_gasoline_2004)
##
## Breusch-Godfrey test for serial correlation of order up to 104
##
## data: Residuals from Linear regression model
## LM test = 149.31, df = 104, p-value = 0.002404
# Forecast 52 weeks ahead using future values of the Fourier regressors.
fc_gasoline_2005 <- forecast(tslm_ft7_gasoline_2004, newdata = data.frame(fourier(gasoline_2014, K = 7, h = 52)))
autoplot(fc_gasoline_2005) +
  # The overlaid 2005 series is the observed data, not the forecast —
  # the original series label "Forecast" was misleading.
  autolayer(window(gasoline, start = 2005, end = 2006), series = "Actual")
Almost all of the actual data fall into the 80% prediction interval, but the model could not predict the sudden dip in the fall.
# Annual water level of Lake Huron.
autoplot(huron)
The chart exhibits a downward trend from 1875 to 1930, but after that the trend disappears.
# simple linear trend
h <- 10  # use `<-` for top-level assignment, not `=`
tslm_huron <- tslm(huron ~ trend)
fc_tslm <- forecast(tslm_huron, h = h)
# Box-Cox power transformation with an automatically chosen lambda
lambda <- BoxCox.lambda(huron)
huron_box <- BoxCox(huron, lambda)
tslm_log_huron <- tslm(huron_box ~ trend)
fc_box_tslm <- forecast(tslm_log_huron, h = h)
# Piecewise linear trend with a single knot at 1915.
t <- time(huron)
t.break <- 1915
# Hinge regressor: zero before the knot, (t - 1915) afterwards.
t_piece <- ts(pmax(0,t-t.break), start=1875)
pw_huron <- tslm(huron ~ t + t_piece)
# Extend both regressors h steps beyond the sample for forecasting;
# t, t_piece, and newdata are reused by the spline model below.
t_new <- t[length(t)]+seq(h)
t_piece_new <- t_piece[length(t_piece)]+seq(h)
newdata <- cbind(t=t_new,
t_piece=t_piece_new) %>%
as.data.frame()
fc_pw_huron <- forecast(
pw_huron,
newdata = newdata
)
# Cubic spline: cubic trend plus truncated square/cubic terms at the knot.
spline_huron <- tslm(huron ~ t + I(t^2) + I(t^3) +I(t_piece^2) + I(t_piece^3))
fc_spline_huron <- forecast(
spline_huron,
newdata = newdata
)
# Compare fits and forecasts from all four Huron trend models.
# Fixes the "Expontial" spelling in both legend labels (fitted and
# forecast layers must share the exact series name to merge in the legend).
autoplot(huron) +
  autolayer(fitted(fc_tslm), series = "Linear") +
  autolayer(fitted(fc_box_tslm), series = "Exponential") +
  autolayer(fitted(fc_pw_huron), series = "PieceWise") +
  autolayer(fitted(fc_spline_huron), series = "Cubic Spline") +
  autolayer(fc_tslm, series = "Linear", PI = FALSE) +
  autolayer(fc_box_tslm, series = "Exponential", PI = FALSE) +
  autolayer(fc_pw_huron, series = "PieceWise", PI = FALSE) +
  autolayer(fc_spline_huron, series = "Cubic Spline", PI = FALSE)
# Side-by-side comparison: simple linear trend vs piecewise trend.
# Small helper building one panel (fitted layer + forecast layer).
huron_panel <- function(fit, fcast, fit_series, fc_series, panel_title) {
  autoplot(huron) +
    autolayer(fitted(fit), series = fit_series) +
    autolayer(fcast, series = fc_series) +
    labs(title = panel_title) +
    theme(legend.position = "none")
}
p1 <- huron_panel(tslm_huron, fc_tslm, "LM", "linear", "LM model")
p2 <- huron_panel(pw_huron, fc_pw_huron, "PieceWise", "PieceWise", "PieceWise model")
grid.arrange(p1, p2, ncol = 2)
The linear regression trend does not capture the trend change around 1915; the piecewise regression does capture it.